#delimit;
/* Session setup: start from an empty dataset, close any log left open by a
previous run, and turn off output paging so the do-file runs unattended. */
clear;
capture log close;
set more off;

/* Folder locals. All of them currently point at the same directory:
	path3      - raw inputs (IPEDS tuition, unemployment, state wages)
	pathtab    - intermediate lookup tables built below
	path4      - CoreLogic house price file
	pathtables - not referenced in the visible script
	out        - main analysis dataset (input and output) */
local path3 "/data";
local path4 "/data";
local pathtab "/data";
local pathtables "/data";
local out "/data";

/* Build a state-level wide table of tuition (one column per year) from the
long IPEDS file, using every series except private nonprofit four-year and
public two-year. Presumably what remains is the public four-year in-state
series (the merged file name below says "pub4") - confirm against the data. */
use "`path3'/long_IPEDS_80s.dta", clear;
drop if inlist(type, "Private Nonprofit Four-Year Tuition and Fees", "Public Two-Year In-State Tuition and Fees");
capture drop _merge;
drop type;
rename year year_enroll;
sort state year_enroll;

/* Recode full state names to two-letter postal codes. Each list element is
"<code> <full name>"; names not listed keep an empty code. */
gen str2 state_t="";
foreach pair in
	"AK Alaska" "AL Alabama" "AR Arkansas" "AZ Arizona" "CA California"
	"CO Colorado" "CT Connecticut" "DC District of Columbia" "DE Delaware"
	"FL Florida" "GA Georgia" "HI Hawaii" "IA Iowa" "ID Idaho" "IL Illinois"
	"IN Indiana" "KS Kansas" "KY Kentucky" "LA Louisiana" "MA Massachusetts"
	"MD Maryland" "ME Maine" "MI Michigan" "MN Minnesota" "MS Mississippi"
	"MO Missouri" "MT Montana" "NC North Carolina" "ND North Dakota"
	"NE Nebraska" "NH New Hampshire" "NJ New Jersey" "NM New Mexico"
	"NV Nevada" "NY New York" "OH Ohio" "OK Oklahoma" "OR Oregon"
	"PA Pennsylvania" "PR Puerto Rico" "RI Rhode Island" "SC South Carolina"
	"SD South Dakota" "TN Tennessee" "TX Texas" "UT Utah" "VA Virginia"
	"VT Vermont" "WA Washington" "WI Wisconsin" "WV West Virginia"
	"WY Wyoming" {;
local abbr=substr("`pair'",1,2);
local fullname=substr("`pair'",4,.);
replace state_t="`abbr'" if state=="`fullname'";
};

drop state;
rename state_t first_state;

/*Tuition expressed in thousands of dollars*/
replace tuition=tuition/1000;

/* One row per state, columns tuition<year> */
reshape wide tuition, i(first_state) j(year_enroll);

sort first_state;
compress;
save "`pathtab'/Tuition_state.dta", replace;

/*Now to get tuition in public 2 year schools.
Builds a wide state-level table of public two-year in-state tuition
(columns tuition_p2<year>) and merges it onto the table saved in
Tuition_state.dta.*/
use "`path3'/long_IPEDS_80s.dta", clear;
drop if type~="Public Two-Year In-State Tuition and Fees";
sort state year;
capture drop _merge;
sort year;

rename year year_enroll;

sort state year_enroll;
drop type;

/* Recode full state names to two-letter postal codes. Each list element is
"<code> <full name>"; names not listed keep an empty code. */
gen str2 state_t="";
foreach pair in
	"AK Alaska" "AL Alabama" "AR Arkansas" "AZ Arizona" "CA California"
	"CO Colorado" "CT Connecticut" "DC District of Columbia" "DE Delaware"
	"FL Florida" "GA Georgia" "HI Hawaii" "IA Iowa" "ID Idaho" "IL Illinois"
	"IN Indiana" "KS Kansas" "KY Kentucky" "LA Louisiana" "MA Massachusetts"
	"MD Maryland" "ME Maine" "MI Michigan" "MN Minnesota" "MS Mississippi"
	"MO Missouri" "MT Montana" "NC North Carolina" "ND North Dakota"
	"NE Nebraska" "NH New Hampshire" "NJ New Jersey" "NM New Mexico"
	"NV Nevada" "NY New York" "OH Ohio" "OK Oklahoma" "OR Oregon"
	"PA Pennsylvania" "PR Puerto Rico" "RI Rhode Island" "SC South Carolina"
	"SD South Dakota" "TN Tennessee" "TX Texas" "UT Utah" "VA Virginia"
	"VT Vermont" "WA Washington" "WI Wisconsin" "WV West Virginia"
	"WY Wyoming" {;
local abbr=substr("`pair'",1,2);
local fullname=substr("`pair'",4,.);
replace state_t="`abbr'" if state=="`fullname'";
};

drop state;
rename state_t first_state;
/*Expressing tuition in 1000 of dollars*/
replace tuition=tuition/1000;

rename tuition tuition_p2;

/* The reshape stub must be the renamed variable. After the rename there is
no variable called exactly "tuition"; using that stub either fails or, via
variable abbreviation, yields tuition<year> columns that collide with the
tuition<year> columns already stored in Tuition_state.dta, so the merge
below would silently keep only one of the two series. */
reshape wide tuition_p2, i(first_state) j(year_enroll);

sort first_state;
compress;
merge 1:1 first_state using "`pathtab'/Tuition_state.dta";
drop _merge;
save "`pathtab'/Tuition_state_pub4_pub2.dta", replace;

/*Now to get tuition in private nonprofit 4 year schools.
Builds a wide state-level table (columns tuition_priv4<year>), merges it
onto the combined table built so far, saves the three-series table, and
removes the two intermediate files.*/
use "`path3'/long_IPEDS_80s.dta", clear;
keep if type=="Private Nonprofit Four-Year Tuition and Fees";
capture drop _merge;
drop type;
rename year year_enroll;
sort state year_enroll;

/* Recode full state names to two-letter postal codes. Each list element is
"<code> <full name>"; names not listed keep an empty code. */
gen str2 state_t="";
foreach pair in
	"AK Alaska" "AL Alabama" "AR Arkansas" "AZ Arizona" "CA California"
	"CO Colorado" "CT Connecticut" "DC District of Columbia" "DE Delaware"
	"FL Florida" "GA Georgia" "HI Hawaii" "IA Iowa" "ID Idaho" "IL Illinois"
	"IN Indiana" "KS Kansas" "KY Kentucky" "LA Louisiana" "MA Massachusetts"
	"MD Maryland" "ME Maine" "MI Michigan" "MN Minnesota" "MS Mississippi"
	"MO Missouri" "MT Montana" "NC North Carolina" "ND North Dakota"
	"NE Nebraska" "NH New Hampshire" "NJ New Jersey" "NM New Mexico"
	"NV Nevada" "NY New York" "OH Ohio" "OK Oklahoma" "OR Oregon"
	"PA Pennsylvania" "PR Puerto Rico" "RI Rhode Island" "SC South Carolina"
	"SD South Dakota" "TN Tennessee" "TX Texas" "UT Utah" "VA Virginia"
	"VT Vermont" "WA Washington" "WI Wisconsin" "WV West Virginia"
	"WY Wyoming" {;
local abbr=substr("`pair'",1,2);
local fullname=substr("`pair'",4,.);
replace state_t="`abbr'" if state=="`fullname'";
};

drop state;
rename state_t first_state;

/*Tuition expressed in thousands of dollars*/
replace tuition=tuition/1000;
rename tuition tuition_priv4;

/* One row per state, columns tuition_priv4<year> */
reshape wide tuition_priv4, i(first_state) j(year_enroll);

sort first_state;
compress;
merge 1:1 first_state using "`pathtab'/Tuition_state_pub4_pub2.dta", nogenerate;
save "`pathtab'/Tuition_state_pub4_pub2_priv4.dta", replace;

/* The intermediate tables are no longer needed */
erase "`pathtab'/Tuition_state_pub4_pub2.dta";
erase "`pathtab'/Tuition_state.dta";


/*Now the unemployment data at the county level, from Bureau of Labor Statistics Local Area Unemployment Statistics Series.
County employed/unemployed counts are summed within state and year to get a
state unemployment rate (in percent); the result is saved wide, one column
urate<year> per year, keyed by state_u.*/
use "`path3'/unemployment.dta", clear;

drop county laus t;
rename (state_fips county_fips) (state county);

/* Recode two-digit state FIPS strings to postal codes. Each list element is
"<code> <fips>"; FIPS codes not listed keep an empty code. */
gen str2 state_t="";
foreach pair in
	"AK 02" "AL 01" "AR 05" "AZ 04" "CA 06" "CO 08" "CT 09" "DC 11" "DE 10"
	"FL 12" "GA 13" "HI 15" "IA 19" "ID 16" "IL 17" "IN 18" "KS 20" "KY 21"
	"LA 22" "MA 25" "MD 24" "ME 23" "MI 26" "MN 27" "MS 28" "MO 29" "MT 30"
	"NC 37" "ND 38" "NE 31" "NH 33" "NJ 34" "NM 35" "NV 32" "NY 36" "OH 39"
	"OK 40" "OR 41" "PA 42" "PR 72" "RI 44" "SC 45" "SD 46" "TN 47" "TX 48"
	"UT 49" "VA 51" "VT 50" "WA 53" "WI 55" "WV 54" "WY 56" {;
local abbr: word 1 of `pair';
local fips: word 2 of `pair';
replace state_t="`abbr'" if state=="`fips'";
};

drop state;
rename state_t state;

/* State totals by year; the county-level rate is not needed downstream */
bys state year: egen sumemployed=total(employed);
bys state year: egen sumunemployed=total(unemployed);
gen urate_state=sumunemployed/(sumunemployed+sumemployed)*100;

drop labor_force employed unemployed urate;
drop sumunemployed sumemployed;
rename year year_urate;

/* Keep a single row per state-year now that the rate is constant within it */
sort state year_urate;
bys state year_urate: keep if _n==1;
drop county;

rename urate_state urate;

/*Reshape to wide so that the rates can later be picked by age, as in the new approach*/
reshape wide urate, i(state) j(year_urate);
rename state state_u;
save "`pathtab'/urate_state.dta", replace;


/*Now the average weekly wage from the QCEW at the state level.
Produces one row per state with a column avg_wkly_wage_<year>_<m> for every
year in the data and m in {3,6,9,12} (quarter number times 3).*/
use "`path3'/state_wages2.dta", clear;
drop area_fips total_qtrly_wages;

sort year qtr;

*rename qtr quarter;
*rename avg_wkly_wage avg_wkly_wage_firststate;
rename year year_w;
sort state year_w qtr;

/* Quarter 1..4 becomes 3,6,9,12 so it can be matched to birth month later */
replace qtr=qtr*3;
qui sum year_w;
forvalues year=`r(min)'(1)`r(max)' {; /*1990-2014*/
forvalues quarter=3(3)12 {;
gen avg_wkly_wage_`year'_`quarter't=avg_wkly_wage if year_w==`year' & qtr==`quarter';
/* Spread the value within each state. Without the by-prefix, max() runs
over the whole file and every state receives the highest wage observed in
any state for that year-quarter. */
bysort state: egen avg_wkly_wage_`year'_`quarter'=max(avg_wkly_wage_`year'_`quarter't);
drop avg_wkly_wage_`year'_`quarter't;
};
};
drop avg_wkly_wage qtr year_w;
rename state state_w;
bysort state_w: keep if _n==1;

save "`pathtab'/state_wages_tomerge.dta", replace;


/*Now I work on the main dataset.
This section computes the earliest observed enrollment date across the
available sources (student_start1-6, Pell award year, NSC, NSLDS), flags
which source supplied it, and builds an adjusted date (_new) that prefers
the NSC/NSLDS date when it falls within a year after the student_start or
Pell date.*/
use "`out'/dataset_for_regs_RandR_noCB.dta", clear;

sort pid year;
/*I will assume that if first_pell_year is, for example, 2003, enrollment started in september
of that year*/
gen min_first_pell_year_mdy=mdy(9,30,award_yr_pell);
format %td min_first_pell_year_mdy;

/*first_col_from_grad_nsc=0 when it should be missing. I will replace it*/
replace first_col_from_grad_nsc=. if first_col_from_grad_nsc==0;

/* min() ignores missing arguments, so each date is the earliest non-missing source */
gen earliest_enrollment=min(student_start1, student_start2, student_start3, student_start4, student_start5, student_start6, min_first_pell_year_mdy, first_col_enroll_nsc, first_col_enroll_nslds);
gen earliest_enrollment_nopell=min(student_start1, student_start2, student_start3, student_start4, student_start5, student_start6, first_col_enroll_nsc, first_col_enroll_nslds);
gen earliest_enrollment_nopell_noTU=min(first_col_enroll_nsc, first_col_enroll_nslds);

format %td earliest_enrollment;
format %td earliest_enrollment_nopell;
format %td earliest_enrollment_nopell_noTU;

/*Now I will create variables saying from where the first enrollment is coming from.
I should use these variables, for example, if I want to constrain some regressions
to certain categories.
The flags are assigned in priority order (NSC, NSLDS, student_start, Pell,
graduation date); each one requires that no earlier flag is set.*/
gen f_enroll_NSC=(earliest_enrollment==first_col_enroll_nsc & first_col_from_grad_nsc~=1 & earliest_enrollment~=.);
gen f_enroll_NSLDS=(earliest_enrollment==first_col_enroll_nslds & f_enroll_NSC~=1 & earliest_enrollment~=.);
gen f_enroll_TU=((earliest_enrollment==student_start1 | earliest_enrollment==student_start2 | earliest_enrollment==student_start3 | earliest_enrollment==student_start4 | earliest_enrollment==student_start5 | earliest_enrollment==student_start6) 
	& f_enroll_NSC~=1 & f_enroll_NSLDS~=1 & earliest_enrollment~=.);
gen f_enroll_pell=(earliest_enrollment==min_first_pell_year_mdy & f_enroll_NSC~=1 & f_enroll_NSLDS~=1 & f_enroll_TU~=1 & earliest_enrollment~=.); /*From Pell*/
/* Full variable name f_enroll_pell is spelled out: an abbreviation only
resolves while variable abbreviation is on and no other variable shares the prefix */
gen f_enroll_grad=(earliest_enrollment==first_col_enroll_nsc & first_col_from_grad_nsc==1 & f_enroll_NSC~=1 & f_enroll_NSLDS~=1 & f_enroll_TU~=1 & f_enroll_pell~=1 & earliest_enrollment~=.); /*From Graduation Date*/


/* Days by which the NSC / NSLDS date follows a student_start-based earliest date */
gen dif_TU_NSC=first_col_enroll_nsc-earliest_enrollment if first_col_enroll_nsc~=. & f_enroll_TU==1;
gen dif_TU_NSLDS=first_col_enroll_nslds-earliest_enrollment if first_col_enroll_nslds~=. & f_enroll_TU==1;

/* Adjusted date: switch to the NSC/NSLDS date when it is 1-365 days later */
gen earliest_enrollment_new=earliest_enrollment;
replace earliest_enrollment_new=earliest_enrollment_nopell_noTU if ((dif_TU_NSC>0 & dif_TU_NSC<366) |  (dif_TU_NSLDS>0 & dif_TU_NSLDS<366)) & first_col_from_grad_nsc~=1 & f_enroll_TU==1; 


/* Same adjustment when the earliest date came from the Pell award year */
gen dif_Pell_NSC=first_col_enroll_nsc-earliest_enrollment if first_col_enroll_nsc~=. & f_enroll_pell==1;
gen dif_Pell_NSLDS=first_col_enroll_nslds-earliest_enrollment if first_col_enroll_nslds~=. & f_enroll_pell==1;

replace earliest_enrollment_new=earliest_enrollment_nopell_noTU if ((dif_Pell_NSC>0 & dif_Pell_NSC<366) |  (dif_Pell_NSLDS>0 & dif_Pell_NSLDS<366)) & first_col_from_grad_nsc~=1 & f_enroll_pell==1; 

format %td earliest_enrollment_new;

/* Source flags recomputed for the adjusted date, same priority order */
gen f_enroll_NSC_new=(earliest_enrollment_new==first_col_enroll_nsc & first_col_from_grad_nsc~=1 & earliest_enrollment_new~=.);
gen f_enroll_NSLDS_new=(earliest_enrollment_new==first_col_enroll_nslds & f_enroll_NSC_new~=1 & earliest_enrollment_new~=.);
gen f_enroll_TU_new=((earliest_enrollment_new==student_start1 | earliest_enrollment_new==student_start2 | earliest_enrollment_new==student_start3 | earliest_enrollment_new==student_start4 | earliest_enrollment_new==student_start5 | earliest_enrollment_new==student_start6)
	& f_enroll_NSC_new~=1 & f_enroll_NSLDS_new~=1 & earliest_enrollment_new~=.);
gen f_enroll_pell_new=(earliest_enrollment_new==min_first_pell_year_mdy & f_enroll_NSC_new~=1 & f_enroll_NSLDS_new~=1 & f_enroll_TU_new~=1 & earliest_enrollment_new~=.); /*From Pell*/
gen f_enroll_grad_new=(earliest_enrollment_new==first_col_enroll_nsc & first_col_from_grad_nsc==1 & f_enroll_NSC_new~=1 & f_enroll_NSLDS_new~=1 & f_enroll_TU_new~=1 & f_enroll_pell_new~=1 & earliest_enrollment_new~=.); /*From Graduation Date*/

/*Now I want to generate the age at first enrollment, for both measures*/
gen year_earliest_enrollment=year(earliest_enrollment);
gen year_earliest_enrollment_new=year(earliest_enrollment_new);
gen month_earliest_enrollment=month(earliest_enrollment);
gen month_earliest_enrollment_new=month(earliest_enrollment_new);

/* Map each enrollment date to a period labelled with one of the years
1997 1999 2001 2003 2004 2007 2008 2010 2012 2014 (the same years for which
day_of_TU_<year> exists below). Dates from July onward roll into the next
period. */
sort pid year;
local var="earliest_enrollment earliest_enrollment_new";
foreach i of local var {;
gen period_`i'=1997 if year_`i'<1997 | (year_`i'==1997 & month_`i'<=6);
replace period_`i'=1999 if year_`i'==1998 | (year_`i'==1997 & month_`i'>6) | (year_`i'==1999 & month_`i'<=6);
replace period_`i'=2001 if year_`i'==2000 | (year_`i'==1999 & month_`i'>6) | (year_`i'==2001 & month_`i'<=6);
replace period_`i'=2003 if year_`i'==2002 | (year_`i'==2001 & month_`i'>6) | (year_`i'==2003 & month_`i'<=6);
replace period_`i'=2004 if year_`i'==2004 | (year_`i'==2003 & month_`i'>6);
replace period_`i'=2007 if year_`i'==2005 | year_`i'==2006 | (year_`i'==2007 & month_`i'<=6);
replace period_`i'=2008 if year_`i'==2008 | (year_`i'==2007 & month_`i'>6);
replace period_`i'=2010 if year_`i'==2009 | (year_`i'==2010 & month_`i'<=6);
replace period_`i'=2012 if year_`i'==2011 | (year_`i'==2010 & month_`i'>6) | (year_`i'==2012 & month_`i'<=6);
/* July-December 2012 was not covered by any rule and was left missing;
it belongs to the 2014 period like the other second-half cases */
replace period_`i'=2014 if year_`i'==2013 | (year_`i'==2012 & month_`i'>6) | (year_`i'==2014 & month_`i'<=6);
};

/* Collapse the wave-specific day_of_TU_<year> columns into one date matching the row's year */
gen day_of_TU=day_of_TU_1997 if year==1997;
replace day_of_TU=day_of_TU_1999 if year==1999;
replace day_of_TU=day_of_TU_2001 if year==2001;
replace day_of_TU=day_of_TU_2003 if year==2003;
replace day_of_TU=day_of_TU_2004 if year==2004;
replace day_of_TU=day_of_TU_2007 if year==2007;
replace day_of_TU=day_of_TU_2008 if year==2008;
replace day_of_TU=day_of_TU_2010 if year==2010;
replace day_of_TU=day_of_TU_2012 if year==2012;
replace day_of_TU=day_of_TU_2014 if year==2014;
format day_of_TU %td;

/* Age at enrollment = age on the 2004 row minus the years (days/365)
between the enrollment date and day_of_TU_2004; computed on the 2004 row
and then spread to every row of the person */
gen age_earliest_enrollment_t=age-(day_of_TU_2004-earliest_enrollment)/365 if year==2004;
gen age_earliest_enrollment_new_t=age-(day_of_TU_2004-earliest_enrollment_new)/365 if year==2004;
by pid: egen age_earliest_enrollment=max(age_earliest_enrollment_t);
by pid: egen age_earliest_enrollment_new=max(age_earliest_enrollment_new_t);
/*Now I will create location variables*/
sort pid year;
/*Checking when we observe a zip for the first time, and at what age.
The loop does the same for each geography in the list (zip, county, tract,
bg, state) and creates, per geography X:
	firstX           - first non-empty value of X for the person, on every row
	firstage_X       - age on the row where X is first observed
	max_dif_age_X    - that age minus age_earliest_enrollment
	max_dif_age_X_new- same, using age_earliest_enrollment_new
plus helper variables (withX_t, withX, firstX_t, ...) dropped further down.*/
local var="zip county tract bg state";
foreach i of local var {;
/* Restore person-year order: the gsort at the end of the previous iteration changed it */
sort pid year;
/* Flag rows where X is non-empty, then carry the flag forward to later rows of the same person */
gen with`i'_t=1 if `i'~="";
replace with`i'_t=1 if `i'=="" & with`i'_t[_n-1]==1 & pid==pid[_n-1];
/* Running count of flagged rows within person; it equals 1 exactly on the first row with X observed */
by pid: gen with`i'=sum(with`i'_t);
gen first`i'_t=`i' if with`i'==1;

gen firstage_`i't=age if first`i'_t~="";
by pid: egen firstage_`i'=max(firstage_`i't);
gen period_first`i'_t=year if first`i'_t~="";

/* Descending sort puts the non-empty first value at the top of each person so it can be copied down */
gsort pid -first`i'_t;
gen first`i'=first`i'_t;
replace first`i'=first`i'[_n-1] if first`i'[_n-1]~="" & pid==pid[_n-1];
/* Age gap between first observing X and first enrollment (non-missing only on the first-observed row), spread to all rows via max */
gen dif_age_`i'=age-age_earliest_enrollment if with`i'==1;
by pid: egen max_dif_age_`i'=max(dif_age_`i');

gen dif_age_`i'_new=age-age_earliest_enrollment_new if with`i'==1;
by pid: egen max_dif_age_`i'_new=max(dif_age_`i'_new);
};

/********************
Now I will create a state variable based on TU info only for when we observe that
info before age 23, to follow Editor's request
*********************/
/* firstage_state missing compares as larger than 23, so those rows stay empty */
gen firststate23=firststate if firstage_state<23;

sort pid year;
/* State of first college, taken from whichever source supplied the earliest
enrollment date; for student_start-based dates the first observed state is
used only when it was observed at or before enrollment (gap<=0) */
gen first_col_state=first_col_state_nsc if (f_enroll_NSC==1 | f_enroll_grad==1);
replace first_col_state=first_col_state_nslds if f_enroll_NSLDS==1;
replace first_col_state=first_col_state_pell if f_enroll_pell==1;
replace first_col_state=firststate if f_enroll_TU==1 & max_dif_age_state<=0;

/* Same assignment using the adjusted (_new) enrollment date and flags */
gen first_col_state_new=first_col_state_nsc if (f_enroll_NSC_new==1 | f_enroll_grad_new==1);
replace first_col_state_new=first_col_state_nslds if f_enroll_NSLDS_new==1;
replace first_col_state_new=first_col_state_pell if f_enroll_pell_new==1;
replace first_col_state_new=firststate if f_enroll_TU_new==1 & max_dif_age_state_new<=0;


/* "earliest" variants: override with the first observed state whenever it was
observed strictly before enrollment, regardless of the date's source */
gen first_col_state_earliest=first_col_state;
replace first_col_state_earliest=firststate if /*hs_state_2=="" &*/ max_dif_age_state<0 & firststate~="";

gen first_col_state_earliest_new=first_col_state_new;
replace first_col_state_earliest_new=firststate if /*hs_state_2=="" &*/ max_dif_age_state_new<0 & firststate~="";

/*************************************
I'll replace state information
for those that didn't go to college with information from TU
(this fills any row where the state is still empty and firststate is not)
**************************************/
replace first_col_state=firststate if first_col_state=="" & firststate~="";
replace first_col_state_new=firststate if first_col_state_new=="" & firststate~="";
replace first_col_state_earliest=firststate if first_col_state_earliest=="" & firststate~="";
replace first_col_state_earliest_new=firststate if first_col_state_earliest_new=="" & firststate~="";

/*Counting observations to know how home_state was determined.
Diagnostic only: everything between preserve and restore is discarded, and
the numbers in the trailing comments are the counts recorded by the author.*/
preserve;
	bys pid: keep if year==2004;
	/* Hard-coded list of person ids excluded from the counts; the reason for the exclusion is not stated in this file */
	gen todrop=(pid=="CQ372100006084" | pid=="CQ372100009510" | pid=="CQ372200002745" | 
	pid=="CQ372200003550" | pid=="CQ372400000387" | pid=="CQ372400000600" | pid=="CQ372400001543" | 
	pid=="CQ372400003695" | pid=="CQ372400004137" | pid=="CQ372400005128" | pid=="CQ372400005720" | 
	pid=="CQ372400006424" | pid=="CQ372500000301" | pid=="CQ372500000380" | pid=="CQ372500001977" | 
	pid=="CQ372500006063" | pid=="CQ372500007637" | pid=="CQ372500008864" | pid=="CQ372600001756" |
	pid=="CQ372600004821" | pid=="CQ372700001705" | pid=="CQ372700002435" | pid=="CQ372700003380" | 
	pid=="CQ372700006709" | pid=="CQ372700006975" | pid=="CQ372800000154" | pid=="CQ372800005549" | 
	pid=="CQ372900002337" | pid=="CQ372900004187" | pid=="CQ372900005785" | pid=="CQ372900008630" | 
	pid=="CQ373100001269" | pid=="CQ373100001603" | pid=="CQ373100006188" | pid=="CQ373200000787" | 
	pid=="CQ373200001594" | pid=="CQ373200002688" | pid=="CQ373200004182" | pid=="CQ373200004581" |
	pid=="CQ373200004691" | pid=="CQ373200006989" | pid=="CQ373200007213" | pid=="CQ373200008162" | 
	pid=="CQ373200008733" | pid=="CQ373300000792" | pid=="CQ373300002947" | pid=="CQ373300004204" | 
	pid=="CQ373300004205" | pid=="CQ373400000626" | pid=="CQ373400002904" | pid=="CQ373500000720" | 
	pid=="CQ373600004108" | pid=="CQ373600004759" | pid=="CQ373600006182" | pid=="CQ373600007420" | 
	pid=="CQ373700002243" | pid=="CQ373700002715" | pid=="CQ373700005844" | pid=="CQ373700009096" |
	pid=="CQ373800002129" | pid=="CQ373800002271" | pid=="CQ373800002364" | pid=="CQ373900003633" | 
	pid=="CQ373900006494" | pid=="CQ373900007014" | pid=="9EA00300005576" | pid=="9EA00400005572" | 
	pid=="9EA00500002667" | pid=="9EA00500005465" | pid=="9EA00700007009" | pid=="9EA00800009275" | 
	pid=="CQ370200000953" | pid=="CQ370200001939" | pid=="CQ370200003277" | pid=="CQ370300005225" | 
	pid=="CQ370300007037" | pid=="CQ370300008687" | pid=="CQ370400003593" | pid=="CQ370400005529" |
	pid=="CQ370400007514" | pid=="CQ370400008412" | pid=="CQ370500001079" | pid=="CQ370500001258" | 
	pid=="CQ370500003356" | pid=="CQ370500006624" | pid=="CQ370500006805" | pid=="CQ370500007723" | 
	pid=="CQ370500008189" | pid=="CQ370600001023" | pid=="CQ370600008519" | pid=="CQ370600010249" | 
	pid=="CQ370700001187" | pid=="CQ370700004542" | pid=="CQ370700005882" | pid=="CQ370800000424" | 
	pid=="CQ370800005683" | pid=="CQ370800006851" | pid=="CQ370900001216" | pid=="CQ370900001674" |
	pid=="CQ370900002376" | pid=="CQ370900005657" | pid=="CQ370900008437" | pid=="CQ371100002719" | 
	pid=="CQ371100003439" | pid=="CQ371100003597" | pid=="CQ371100004633" | pid=="CQ371100005460" | 
	pid=="CQ371100008563" | pid=="CQ371100009361" | pid=="CQ371200004861" | pid=="CQ371200008210" | 
	pid=="CQ371200008915" | pid=="CQ371300001384" | pid=="CQ371300001647" | pid=="CQ371300001711" | 
	pid=="CQ371300001875" | pid=="CQ371300002833" | pid=="CQ371300004506" | pid=="CQ371300006181" |
	pid=="CQ371400004368" | pid=="CQ371400005464" | pid=="CQ371400005694" | pid=="CQ371400007372" | 
	pid=="CQ371500000124" | pid=="CQ371500003405" | pid=="CQ371500006187" | pid=="CQ371500007891" | 
	pid=="CQ371600000271" | pid=="CQ371600006663" | pid=="CQ371700000393" | pid=="CQ371700003661" | 
	pid=="CQ371700004893" | pid=="CQ371700005284" | pid=="CQ371700006966" | pid=="CQ371800001716" | 
	pid=="CQ371800004333" | pid=="CQ371800006108" | pid=="CQ371800007280" | pid=="CQ371800007646" |
	pid=="CQ371800008647" | pid=="CQ371900003690");

	drop if todrop==1;
	
	keep pid max_dif_age_state_new f_enroll_NSC_new f_enroll_NSLDS_new f_enroll_TU_new f_enroll_grad_new f_enroll_pell_new first_col_state_earliest_new firststate first_col_state_nsc first_col_state_nslds first_col_state_pell;
	dis _N; /*34,750*/
	/* People with a state assigned. The recorded 34,744 is the sum of the
	three categories below (7,987+16,942+9,815), all of which require a
	non-empty state, so the test must be "not empty" rather than "empty". */
	count if first_col_state_earliest_new~=""; /*34,744*/
	/*those with state available before going to school*/
	count if first_col_state_earliest_new==firststate & max_dif_age_state_new<0 & firststate~=""; /*7,987*/
	/*those who went to school*/
	count if max_dif_age_state_new>=0 & max_dif_age_state_new<. & first_col_state_earliest_new~=""; /*16,942*/
	/*those who didn't go to school*/
	count if max_dif_age_state_new==. & first_col_state_earliest_new~=""; /*9,815*/
restore;

/* Side file with the assigned state per person-year, stored under the name
state_noCB; the working data are restored afterwards */
preserve;
rename first_col_state_earliest_new state_noCB;
keep pid year state_noCB;
save "`out'/state_noCB_temp.dta", replace;
restore;

/**********Checking the correlation between state variables**********/
/* State taken from school records only (NSC, NSLDS or Pell), according to
the source flags of the adjusted enrollment date */
gen first_state_school=first_col_state_nsc if inlist(1, f_enroll_NSC_new, f_enroll_grad_new);
replace first_state_school=first_col_state_nslds if f_enroll_NSLDS_new==1;
replace first_state_school=first_col_state_pell if f_enroll_pell_new==1;

compress;
/*Attach the state tuition table twice: once keyed on the assigned state
(first_col_state_earliest_new) and once keyed on firststate23. The lookup
file is keyed on first_state, so each key is renamed to first_state for the
merge and renamed back afterwards. Rows found only in the lookup are not kept.*/
rename first_col_state_earliest_new first_state;
sort first_state;
merge m:1 first_state using "`pathtab'/Tuition_state_pub4_pub2_priv4.dta", keep(master match) nogenerate;
/*Be careful! The state variable gets a shorter name from here on*/
rename first_state first_col_state_earl_new;
/* tuition<...> columns become t<...>_col_state_earl_new */
rename tuition* t*_col_state_earl_new;
sort pid year;

/*Second pass, based on TU information when it is observed before age 23*/
rename firststate23 first_state;
sort first_state;
merge m:1 first_state using "`pathtab'/Tuition_state_pub4_pub2_priv4.dta", keep(master match) nogenerate;
rename first_state firststate23;
/* tuition<...> columns become t<...>_firststate23 */
rename tuition* t*_firststate23;
sort pid year;

/*Now merging unemployment rates at the state level.
For each age 22-40, u_rate<age> is the state rate of the calendar year in
which (year - dob_y) equals that age, using the assigned state.*/
rename first_col_state_earl_new state_u;
merge m:1 state_u using "`pathtab'/urate_state", keep(master match) nogenerate;
rename state_u first_col_state_earl_new;

forvalues yo=22/40 {;
	gen u_rate`yo'=.;
	forvalues year=1990/2014 {;
		replace u_rate`yo'=urate`year' if `year'-dob_y==`yo';
	};
};
/* The wide urate<year> columns are only needed to fill the age columns */
drop urate1990-urate2014;

/*******NEW-Feb 2018******/
/*Same construction, with the state taken from TU info observed before age 23*/
rename firststate23 state_u;
merge m:1 state_u using "`pathtab'/urate_state", keep(master match) nogenerate;
rename state_u firststate23;

forvalues yo=22/40 {;
	gen u_rate_firststate23_`yo'=.;
	forvalues year=1990/2014 {;
		replace u_rate_firststate23_`yo'=urate`year' if `year'-dob_y==`yo';
	};
};
drop urate1990-urate2014;

/*Now merging wages at the state level.
wages<age> is the state average weekly wage for the year in which
(year - dob_y) equals the age, taken from the quarter containing the birth
month (column suffix 3, 6, 9 or 12).*/
rename first_col_state_earl_new state_w;
sort state_w;
merge m:1 state_w using "`pathtab'/state_wages_tomerge.dta";
drop if _merge==2;
drop _merge;

forvalues yo=22(1)40 {;
gen wages`yo'=.;
forvalues year=1990(1)2014 {;
replace wages`yo'=avg_wkly_wage_`year'_3 if `year'-dob_y==`yo' & dob_m<=3;
replace wages`yo'=avg_wkly_wage_`year'_6 if `year'-dob_y==`yo' & dob_m>=4 & dob_m<=6;
replace wages`yo'=avg_wkly_wage_`year'_9 if `year'-dob_y==`yo' & dob_m>=7 & dob_m<=9;
replace wages`yo'=avg_wkly_wage_`year'_12 if `year'-dob_y==`yo' & dob_m>=10 & dob_m<=12;
};
};

drop avg_wkly_wage_199*_* avg_wkly_wage_20*_*;

/*********
Corelogic
corelogic<age> is the y_<year>_<month> value for the birth month in the
year the person reaches that age (years 1993-2012).
**********/
rename state_w state_c;
merge m:1 state_c using "`path4'/corelogic_final.dta";
/* Like every other lookup merge in this file, discard rows that exist only
in the lookup so no person-less observations are carried along */
drop if _merge==2;
drop y_197*_* y_198*_*;
drop _merge;
forvalues yo=22(1)40 {;
gen corelogic`yo'=.;
forvalues year=1993(1)2012 {;
/* Months 1-9 carry a leading zero in the column name */
forvalues month=1(1)9 {;
replace corelogic`yo'=y_`year'_0`month' if `year'-dob_y==`yo' & dob_m==`month';
};
forvalues month=10(1)12 {;
replace corelogic`yo'=y_`year'_`month' if `year'-dob_y==`yo' & dob_m==`month';
};
};
};
drop y_199*_* y_20*_*;
rename state_c first_col_state_earl_new;

sort pid year;

/*******NEW-Feb 2018******/
/*Now merging wages at the state level from TU before age 23.
Same construction as wages<age>, keyed on firststate23.*/
rename firststate23 state_w;
sort state_w;
merge m:1 state_w using "`pathtab'/state_wages_tomerge.dta";
drop if _merge==2;
drop _merge;

forvalues yo=22(1)40 {;
gen wages_firststate23_`yo'=.;
forvalues year=1990(1)2014 {;
replace wages_firststate23_`yo'=avg_wkly_wage_`year'_3 if `year'-dob_y==`yo' & dob_m<=3;
replace wages_firststate23_`yo'=avg_wkly_wage_`year'_6 if `year'-dob_y==`yo' & dob_m>=4 & dob_m<=6;
replace wages_firststate23_`yo'=avg_wkly_wage_`year'_9 if `year'-dob_y==`yo' & dob_m>=7 & dob_m<=9;
replace wages_firststate23_`yo'=avg_wkly_wage_`year'_12 if `year'-dob_y==`yo' & dob_m>=10 & dob_m<=12;
};
};

drop avg_wkly_wage_199*_* avg_wkly_wage_20*_*;

/*******NEW-Feb 2018******/
/*********
Corelogic
**********/
rename state_w state_c;
merge m:1 state_c using "`path4'/corelogic_final.dta";  /*proprietary data with house prices at the state level*/
/* Clean up after the merge as the other lookup merges do: discard
lookup-only rows and remove _merge, which otherwise ends up in the saved
analysis dataset */
drop if _merge==2;
drop _merge;
drop y_197*_* y_198*_*;
forvalues yo=22(1)40 {;
gen corelogic_firststate23_`yo'=.;
forvalues year=1993(1)2012 {;
/* Months 1-9 carry a leading zero in the column name */
forvalues month=1(1)9 {;
replace corelogic_firststate23_`yo'=y_`year'_0`month' if `year'-dob_y==`yo' & dob_m==`month';
};
forvalues month=10(1)12 {;
replace corelogic_firststate23_`yo'=y_`year'_`month' if `year'-dob_y==`yo' & dob_m==`month';
};
};
};
drop y_199*_* y_20*_*;
rename state_c firststate23;

sort pid year;



/*Academic year in which the person started school: enrollments from July
onward count toward the academic year starting that calendar year, earlier
months toward the one that started the year before.*/
gen year_earl_enrollment_new_acad=cond(month_earliest_enrollment_new<7, year_earliest_enrollment_new-1, year_earliest_enrollment_new);

*drop quarter;
*drop t*_col* t*_firststate;

/* Remove helper variables created while building the enrollment and
location measures */
drop period_earliest_enrollment period_earliest_enrollment_new;
foreach geo in zip county tract bg state {;
	drop with`geo'_t with`geo' first`geo'_t period_first`geo'_t;
};

drop age_earliest_enrollment_t age_earliest_enrollment_new_t;
foreach geo in zip county tract bg {;
	drop dif_age_`geo' max_dif_age_`geo' dif_age_`geo'_new max_dif_age_`geo'_new;
};
/* For state, the max_dif_age_state* variables are kept */
drop dif_age_state dif_age_state_new;

drop earliest_enrollment_nopell earliest_enrollment_nopell_noTU;

/* Remove raw source columns that are not carried into the regression file */
drop state_fips state_county missing_state missing_county cmsa msa
	avg_agi1998 avg_agi2001 avg_agi2002 avg_agi2004 avg_agi2005 avg_agi2006
	avg_agi2007 avg_agi2008 avg_agi2009 avg_agi2010 avg_agi2011 avg_agi2012
	first_col_opeid_nsc first_col_enroll_nsc first_col_from_grad_nsc
	first_col_opeid_nslds first_col_enroll_nslds first_col_state_fips_nslds
	first_col_sector_nslds
	award_yr_pell first_col_opeid_pell first_col_state_fips_pell first_col_sector_pell;

/* The output keeps only the 2004 row of each person */
keep if year==2004;
drop year;
compress;
save "`out'/dataset_for_regs_new_RandR_noCB.dta", replace;

/* Remove the intermediate lookup files. Files written earlier in this
do-file are erased unconditionally; files this do-file does not create are
erased with capture so a missing file does not abort the run after the
output has already been saved. */
*erase "`path3'/zillow_hpi_tomerge.dta";
*erase "`path3'/cl_hpi_tomerge.dta";
*erase "`path3'/census_info_tract.dta";
*erase "`path3'/census_info_county.dta";
*erase "`path3'/census_info_state.dta";
*erase "`path3'/census_info_first_col_state.dta";
*erase "`path3'/census_info_first_col_state_new.dta";
*erase "`path3'/census_info_first_col_state_earliest.dta";
*erase "`path3'/census_info_first_col_state_earliest_new.dta";
*erase "`path3'/urate_county.dta";
erase "`pathtab'/state_wages_tomerge.dta";
*erase "`path3'/state_wages_tomerge_first_col_state.dta";
*erase "`path3'/state_wages_tomerge_first_col_state_new.dta";
*erase "`path3'/state_wages_tomerge_first_col_state_earliest.dta";
*erase "`path3'/state_wages_tomerge_first_col_state_earliest_new.dta";
*erase "`path3'/county_wages_tomerge.dta";
erase "`pathtab'/urate_state.dta";
/* zillow_state.dta is not written anywhere in this do-file */
capture erase "`pathtab'/zillow_state.dta";
*erase "`path1'/cpi_data_all_wages.dta";
/* The local path1 is not defined in this do-file; an empty macro would turn
the path into "/cpi_data_all_tuition.dta" at the filesystem root, so the
erase is attempted only when path1 has been set */
if "`path1'"~="" {;
capture erase "`path1'/cpi_data_all_tuition.dta";
};


